CNV and Depth Metrics of Chromosomes

Author

Claudia Zirión-Martínez

Published

May 1, 2025

Setup

Libraries

Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)
library(truncnorm)
library(ggpattern)
library(svglite)

Paths

Code
metadata_path <-
    "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
cnv_chroms_path <-
    "../Crypto_Desjardins_Ashton/results/02.Dataset/cnv/cnv_chromosomes.tsv"
chrom_metrics_out_path <-
    "results/tables/chrom_metrics.tsv"
ploidy_path <- "results/tables/ploidy_from_plots.tsv"

Prepare dataset

Metadata

Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.

Code
metadata <- read.delim(
    metadata_path,
    header=TRUE,
    sep=",",
    stringsAsFactor = TRUE)
metadata <- metadata %>%
    select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
    filter(!strain == "H99") %>%
    group_by(dataset, lineage)%>%
    mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
    ungroup() %>%
    group_by(lineage)%>%
    mutate(samples_in_lineage = n_distinct(sample))%>%
    ungroup()%>%
    mutate(total_samples = n_distinct(sample))%>%
    droplevels()
Code
ploidy <- read.delim(
                ploidy_path,
                header = TRUE,
                sep = "\t",
                stringsAsFactors = FALSE)%>%
          select(sample, lineage,
                 accession, chromosome, ploidy, type, gappy, 
                 distribution_pattern, fraction_duplications,
                 steps, smile, uniform)%>%
            distinct()
    
ploidy$ploidy <- factor(ploidy$ploidy, levels = c("haploid", "diploid", "triploid", "tetraploid"))
ploidy$type <- factor(ploidy$type, levels = c("full", "large", "partial"))

CNV Regions and Depth per Chromosome

Code
cnv_chroms <- read.delim(
    cnv_chroms_path,
    header = TRUE,
    sep = "\t")%>%
    mutate(chromosome = str_pad(chromosome, 2, pad = "0"))
cnv_chroms$chromosome <- as.factor(cnv_chroms$chromosome)

levels(cnv_chroms$chromosome) <- paste("chr", levels(cnv_chroms$chromosome), sep="")

Calculate
* percent_largest_region : Percentage of length of chromosome covered by the largest CNV region of the chormosome.
* diff_span_cov: Difference between span_percent and coverage_percent.
* n_regions_100kb: Number of regions per 100kb

Code
cnv_chroms <- cnv_chroms %>%
    mutate(percent_largest_region = (size_largest_region / length)*100,
            diff_span_cov = abs(span_percent - coverage_percent),
            n_regions_100kb = (n_regions / length) * 100000)

Pivot table to get only one line per chromosome.
Join with depth per chromosome and with ploidy informations.

Code
chrom_metrics <- cnv_chroms %>%
    pivot_wider(names_from = cnv,
                values_from = c("n_regions","total_size_regions","coverage_percent",  
                            "span_percent","diff_span_cov","size_smallest_region","size_largest_region",
                            "percent_largest_region", "n_regions_100kb",
                            "std_regions_size","norm_depth_mean","norm_depth_median",   
                            "smooth_depth_mean","smooth_depth_median")) %>%
    left_join(ploidy, by = c("sample", "accession", "lineage", "chromosome"))%>%
    left_join(metadata, by = c("sample", "lineage"))

Simulate expected trend

Simulation of haploid chromosomes with all proportions (in steps of 5) of percentage of chromosome covered by deletions, duplications and single-copy regions.

Select simulation of percentage of deletion = 5.

Code
n <- 100000
sd_v <- 0.2

p <- data.frame(p_del = c(rep(seq(0,100, by = 5), each = 100)),
                p_sc  = c(rep(seq(0,100, by = 5), 100)))%>%
    mutate(sum = p_del + p_sc)%>%
    filter(sum <= 100)%>%
    distinct()
               
sample_summary <- p %>%
        mutate(p_dup = 100 - p_del - p_sc,
                n_del = p_del * n / 100,
                n_sc = p_sc * n  / 100,
                n_dup = p_dup * n  / 100)%>%
        rowwise()%>%
        mutate(median_depth = median(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
                                 rnorm(n_sc, mean = 1, sd = sd_v), 
                                 rnorm(n_dup, mean = 2, sd = sd_v))),
                mean_depth = mean(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
                                 rnorm(n_sc, mean = 1, sd = sd_v), 
                                 rnorm(n_dup, mean = 2, sd = sd_v))))%>%
        select(p_del,p_sc,p_dup, median_depth, mean_depth)%>%
        mutate(category = paste(p_del,p_sc,p_dup, sep = "_"),
                dup_filter = ifelse(p_dup >= 50, "half_duplication", "small_duplication"),
                del_filter = ifelse(p_del >= 50, "half_deletion", "small_deletion"))

trend <- filter(sample_summary, p_del == 5)

Base plot

Code
ggplot() + theme_minimal()
p <- ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point()+
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "vertical")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")
p1 <- ggMarginal(
    p,
    type = "boxplot",
    margins = "both",
    size = 5,
    groupColour = FALSE,
    groupFill = FALSE
)      
p1

Filter chromosomes with some degree of duplication

To create a version of this notebook with and without the filter This version is not filtered

Ploidy and Type (Partial / Full)

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(alpha = 0.1)+
        geom_point(aes(color = ploidy, shape = type))+
        scale_color_brewer(name = "Ploidy", palette = "Set1") +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications",
            shape = "Type")

Percent covered by deletion

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_point(aes(color = coverage_percent_deletion, shape = type))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = coverage_percent_deletion))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = coverage_percent_deletion))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Number of detected duplicated CNV regions per 100kb

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_point(aes(color = n_regions_100kb_duplication,shape = type))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = n_regions_100kb_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = n_regions_100kb_duplication))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Span

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_point(aes(color = span_percent_duplication,shape = type)) +
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = span_percent_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = span_percent_duplication))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Difference between Span and Coverage

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_point(aes(color = diff_span_cov_duplication,shape = type)) +
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = diff_span_cov_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = diff_span_cov_duplication))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Median depth of the windows that are part of the duplicated CNV regions

Code
max_color_range <- paste("2.7 -", max(chrom_metrics$norm_depth_median_duplication, na.rm = TRUE))
colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", max_color_range)
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4), 
            aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(aes(color = cut(norm_depth_median_duplication, 
                            breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf), 
                            labels = names(colors)),
                            shape = type),
                            alpha = 1) +
        scale_color_manual(values = colors, 
                            name = "Median Depth of dupCNV\nWindows in Chromosome")+
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")
Warning: Removed 8874 rows containing missing values or values outside the scale range
(`geom_point()`).

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
        geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        scale_y_continuous(limits = c(0,5))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
        geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(limits = c(0,5))+
        theme_minimal() +
        theme(legend.position = "bottom")

Smoothed median depth of the windows that are part of the duplicated CNV regions

Code
colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", "2.7 - 4")
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4), 
            aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(aes(color = cut(smooth_depth_median_duplication, 
                            breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf), 
                            labels = names(colors)),
                            shape = type),
                            alpha = 1) +
        scale_color_manual(values = colors, 
                            name = "Smooth Median Depth of dupCNV\nWindows in Chromosome")+
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")
Warning: Removed 8874 rows containing missing values or values outside the scale range
(`geom_point()`).

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        scale_y_continuous(limits = c(0,5))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(limits = c(0,5))+
        theme_minimal() +
        theme(legend.position = "bottom")

Percent of chromosome covered by the largest single-copy region

Code
colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 5", "6 - 10", "11 -15", "16 - 20", "21 - 100")

ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        # geom_point(aes(color = percent_largest_region_single_copy))+
        # scale_color_viridis_c(direction = -1)+
        geom_point(aes(color = cut(percent_largest_region_single_copy, 
                           breaks = c(0,5,10,15,20,100), 
                            labels = names(colors)),
                            shape = type)) + 
        scale_color_manual(values = colors, 
                            name = "Largest Single-Copy Region")+
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "vertical")

Extra
Code
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = percent_largest_region_single_copy))+
        geom_point(aes(color = norm_chrom_median))+
        scale_color_viridis_c(direction = -1) +
        scale_x_continuous(breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = percent_largest_region_single_copy))+
        geom_point(aes(color = coverage_percent_duplication))+
        scale_color_viridis_c(direction = -1) +
        theme_minimal() +
        theme(legend.position = "bottom")

Names

Code
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4), 
                aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
        geom_text_repel(aes(label = sample), size = 3, max.overlaps = 10) +
        scale_x_continuous(limits = c(0,100), breaks = seq(0,100, by = 10))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")

Names, lower left

Code
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4), 
                aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
        geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
        scale_x_continuous(limits = c(0,50),breaks = seq(0,50, by = 5))+
        scale_y_continuous(limits = c(1, 1.6))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")

Names, middle right

Code
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4), 
                aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
        geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
        scale_x_continuous(limits = c(50,100),breaks = seq(50,100, by = 5))+
        scale_y_continuous(limits = c(1.4,2))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")

Names, upper right

Code
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4), 
                aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
        geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
        geom_point(alpha = 0.5, aes(color = lineage,shape = type))+
        geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 20) +
        scale_x_continuous(limits = c(50,100),breaks = seq(50,100, by = 5))+
        scale_y_continuous(limits = c(2,2.6))+
        theme_minimal() +
        theme(legend.position = "bottom", legend.direction = "horizontal")+
        labs(y = "Normalized Chromosome Median",
            x = "Percent of Chromosome Covered by Duplications")

Filter out partial duplications

coverage_threshold <- 80
Code
duplications_full <- filter(chrom_metrics, coverage_percent_duplication >= coverage_threshold)

Summary tables

Number of samples with duplications in each lineage

Code
dup_lineage <- duplications_full %>%
    group_by(lineage) %>%
    summarise(aneuploid = n_distinct(sample), samples_in_lineage = first(samples_in_lineage))%>%
    mutate(euploid = samples_in_lineage - aneuploid)%>%
    pivot_longer(cols = c(aneuploid, euploid), names_to = "type", values_to = "n_samples")%>%
    mutate(percent = round((n_samples / samples_in_lineage) * 100, 1))%>%
    mutate(dup_samples = ifelse(type == "aneuploid", n_samples, ""))
dup_lineage$type <- factor(dup_lineage$type, levels = c("euploid","aneuploid"))
                
dup_lineage %>%
        filter(type == "aneuploid")%>%
        select(lineage, n_samples, percent, samples_in_lineage)
lineage n_samples percent samples_in_lineage
VNBI 2 1.6 122
VNBII 3 4.7 64
VNI 37 4.3 853
VNII 3 18.8 16

Number of samples with duplications in each lineage-source

Code
metadata_source_lineage <- metadata %>%
        group_by(lineage,source,samples_in_lineage) %>%
        summarize(n_lin_source = n_distinct(sample))


dup_lineage_source <- duplications_full %>%
        group_by(lineage,source) %>%
        summarise(aneuploid = n_distinct(sample))%>%
        right_join(metadata_source_lineage, by = c("lineage", "source"))%>%
        replace_na(list(aneuploid = 0))%>%
        arrange(lineage, source)%>%
        mutate(euploid = n_lin_source - aneuploid)%>%
        select(lineage, source, aneuploid, euploid, n_lin_source, samples_in_lineage)%>%
        pivot_longer(cols = c(aneuploid, euploid), names_to = "type", values_to = "n_samples")%>%
        mutate(percent_samples = round((n_samples / samples_in_lineage) * 100, 1),
                percent_source = round((n_lin_source / samples_in_lineage)*100,1),
                dup_samples = ifelse(type == "aneuploid" & source == "Clinical", n_samples, ""),
                type = str_to_title(type),
                category = paste(source, type, sep = " "))

dup_lineage_source$type <- factor(dup_lineage_source$type, levels = c("Euploid","Aneuploid"))
dup_lineage_source$source <- factor(dup_lineage_source$source, levels = c("Environmental","Clinical"))
dup_lineage_source$category <- factor(dup_lineage_source$category, levels =c("Environmental Euploid", "Environmental Aneuploid","Clinical Euploid","Clinical Aneuploid"))

source <- dup_lineage_source%>%
        select(lineage, source, n_lin_source, percent_source)%>%
        distinct
                
dup_lineage_source  %>%
        filter(type == "Aneuploid")%>%
        select(lineage, source, n_samples, percent_samples, n_lin_source)
lineage source n_samples percent_samples n_lin_source
VNBI Clinical 2 1.6 51
VNBI Environmental 0 0.0 71
VNBII Clinical 3 4.7 61
VNBII Environmental 0 0.0 3
VNI Clinical 37 4.3 825
VNI Environmental 0 0.0 28
VNII Clinical 3 18.8 15
VNII Environmental 0 0.0 1

Number of duplicated chromosomes per sample

Code
dup_sample <- duplications_full %>%
    group_by(dataset,lineage, sample, strain, source) %>%
    summarise(n_chroms = n_distinct(chromosome), 
            chromosomes = paste(chromosome, collapse = ", ")) %>%
    arrange(desc(n_chroms))
dup_sample
dataset lineage sample strain source n_chroms chromosomes
Desjardins VNI SRS405109 Bt92 Clinical 2 chr04, chr13
Desjardins VNII SRS417641 C12 Clinical 2 chr12, chr14
Ashton VNI ERS1142697 20427_2#6 Clinical 1 chr12
Ashton VNI ERS1142778 20949_2#15 Clinical 1 chr13
Ashton VNI ERS1142807 20427_3#21 Clinical 1 chr12
Ashton VNI ERS1142815 20427_3#26 Clinical 1 chr09
Ashton VNI ERS1142819 20427_3#30 Clinical 1 chr12
Ashton VNI ERS1142869 20949_2#42 Clinical 1 chr09
Ashton VNI ERS1142878 20427_4#13 Clinical 1 chr09
Ashton VNI ERS2540945 04CN-65-072 Clinical 1 chr13
Ashton VNI ERS2540951 04CN-64-024 Clinical 1 chr01
Ashton VNI ERS2540980 04CN-64-011 Clinical 1 chr13
Ashton VNI ERS2540986 04CN-65-001 Clinical 1 chr12
Ashton VNI ERS2541044 04CN-64-074 Clinical 1 chr12
Ashton VNI ERS2541126 BMD3144 Clinical 1 chr13
Ashton VNI ERS2541138 04CN-03-053 Clinical 1 chr12
Ashton VNI ERS2541212 04CN-03-039 Clinical 1 chr13
Ashton VNI ERS2541247 BMD2209 Clinical 1 chr06
Ashton VNI ERS2541251 04CN-03-081 Clinical 1 chr06
Ashton VNI ERS2541264 BMD3117 Clinical 1 chr12
Ashton VNI ERS2541274 BMD761 Clinical 1 chr13
Ashton VNI ERS2541304 04CN-65-056 Clinical 1 chr01
Ashton VNI ERS2541312 04CN-32-011 Clinical 1 chr06
Ashton VNI ERS2541317 04CN-64-090 Clinical 1 chr12
Ashton VNI ERS2541342 UI_31647-2 Clinical 1 chr12
Ashton VNI ERS542334 14892_1#38 Clinical 1 chr06
Ashton VNI ERS542397 14936_1#6 Clinical 1 chr13
Ashton VNI ERS542496 14893_1#10 Clinical 1 chr13
Desjardins VNBI SRS885841 NRHc5009.REL.INI Clinical 1 chr14
Desjardins VNBI SRS885893 NRHc5045.ENR.CLIN.ISO Clinical 1 chr13
Desjardins VNBII SRS417606 Bt109 Clinical 1 chr09
Desjardins VNBII SRS417652 MW-RSA2967 Clinical 1 chr12
Desjardins VNBII SRS881180 PMHc1029.ENR.STOR Clinical 1 chr12
Desjardins VNI SRS404475 LP-RSA3042 Clinical 1 chr14
Desjardins VNI SRS404481 Bt139 Clinical 1 chr13
Desjardins VNI SRS404518 Bt134 Clinical 1 chr12
Desjardins VNI SRS404772 Bt141 Clinical 1 chr09
Desjardins VNI SRS404793 MW-RSA6134 Clinical 1 chr12
Desjardins VNI SRS405145 Bt117 Clinical 1 chr09
Desjardins VNI SRS409064 MW-RSA1955 Clinical 1 chr12
Desjardins VNI SRS417642 In2632 Clinical 1 chr12
Desjardins VNI SRS520178 MW-RSA3834 Clinical 1 chr12
Desjardins VNI SRS885916 PMHc1031A.ENR.INI.LP Clinical 1 chr12
Desjardins VNII SRS417571 8-1 Clinical 1 chr14
Desjardins VNII SRS520182 WM626 Clinical 1 chr14

Number of samples with duplications in each group of dataset-lineage-chromosome

Code
dup_dataset_lineage_chromosome <- duplications_full %>%
    group_by(dataset,lineage, chromosome) %>%
    summarise(n_samples = n_distinct(sample), 
        samples_in_dataset_lineage = first(samples_in_dataset_lineage))%>%
    mutate(percent_samples = round((n_samples / samples_in_dataset_lineage) * 100, 1))%>%
    select(dataset,lineage, chromosome, n_samples, samples_in_dataset_lineage, percent_samples)%>%
    arrange(chromosome, desc(lineage), desc(n_samples))
dup_dataset_lineage_chromosome
dataset lineage chromosome n_samples samples_in_dataset_lineage percent_samples
Ashton VNI chr01 2 668 0.3
Desjardins VNI chr04 1 185 0.5
Ashton VNI chr06 4 668 0.6
Ashton VNI chr09 3 668 0.4
Desjardins VNI chr09 2 185 1.1
Desjardins VNBII chr09 1 64 1.6
Desjardins VNII chr12 1 16 6.2
Ashton VNI chr12 9 668 1.3
Desjardins VNI chr12 6 185 3.2
Desjardins VNBII chr12 2 64 3.1
Ashton VNI chr13 8 668 1.2
Desjardins VNI chr13 2 185 1.1
Desjardins VNBI chr13 1 122 0.8
Desjardins VNII chr14 3 16 18.8
Desjardins VNI chr14 1 185 0.5
Desjardins VNBI chr14 1 122 0.8

Number of samples with duplications in each group of lineage-chromosome

Code
dup_lineage_chromosome <- duplications_full%>%
    group_by(lineage, chromosome) %>%
    summarise(n_samples = n_distinct(sample), 
        samples_in_lineage = first(samples_in_lineage))%>%
    mutate(percent_samples = round((n_samples / samples_in_lineage) * 100, 1))%>%
    select(lineage, chromosome, n_samples, samples_in_lineage,percent_samples)%>%
    arrange(chromosome, desc(lineage), desc(n_samples))
dup_lineage_chromosome 
lineage chromosome n_samples samples_in_lineage percent_samples
VNI chr01 2 853 0.2
VNI chr04 1 853 0.1
VNI chr06 4 853 0.5
VNI chr09 5 853 0.6
VNBII chr09 1 64 1.6
VNII chr12 1 16 6.2
VNI chr12 15 853 1.8
VNBII chr12 2 64 3.1
VNI chr13 10 853 1.2
VNBI chr13 1 122 0.8
VNII chr14 3 16 18.8
VNI chr14 1 853 0.1
VNBI chr14 1 122 0.8

Number of samples with duplications in each group of lineage-dataset

Code
dup_lineage_dataset <- duplications_full%>%
    group_by(dataset,lineage) %>%
    summarise(n_samples = n_distinct(sample), samples_in_dataset_lineage = first(samples_in_dataset_lineage))%>%
    mutate(percent_samples = round((n_samples / samples_in_dataset_lineage) * 100, 1))%>%
    select(lineage, n_samples, samples_in_dataset_lineage, percent_samples)%>%
    arrange(desc(lineage), desc(n_samples))
dup_lineage_dataset
dataset lineage n_samples samples_in_dataset_lineage percent_samples
Desjardins VNII 3 16 18.8
Ashton VNI 26 668 3.9
Desjardins VNI 11 185 5.9
Desjardins VNBII 3 64 4.7
Desjardins VNBI 2 122 1.6

Number of samples with duplications in each chromosome

Code
dup_chromosome <- duplications_full %>%
    group_by(chromosome) %>%
    summarise(n_samples = n_distinct(sample), total_samples = first(total_samples))%>%
    mutate(percent_samples = round((n_samples / total_samples) * 100, 1))%>%
    select(chromosome, n_samples,total_samples, percent_samples)%>%
    arrange(chromosome, desc(n_samples))
dup_chromosome 
chromosome n_samples total_samples percent_samples
chr01 2 1055 0.2
chr04 1 1055 0.1
chr06 4 1055 0.4
chr09 6 1055 0.6
chr12 18 1055 1.7
chr13 11 1055 1.0
chr14 5 1055 0.5